/* ///////////////////////////////////////////////////////////////////////// */
/*  This is part of the source of the OMAP 5912 heterogeneous dual-core      */
/*  MPEG-4 SP video decoder published in ACM Transactions on Embedded        */
/*  Computing Systems, Vol. X, Issue Y.                                      */
/* ------------------------------------------------------------------------- */
/*  The source code is released under GPL license.                           */
/*                                                                           */
/*  Copyright, 2011                                                          */
/*  Multimedia Embedded Systems Labs                                         */
/*  Dept. of Computer Science                                                */
/*  National Chiao Tung University                                           */
/*  Hsinchu, Taiwan.                                                         */
/* ///////////////////////////////////////////////////////////////////////// */

#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset

#include "metypes.h"

#include "m4vdec_api.h"
#include "bitstream.h"
#include "mbcoding.h"

#include "quant_h263.h"
#include "idct.h"
#include "mem_transfer.h"
#include "bilinear8x8.h"

#include "mbprediction.h"

#include "image.h"
#include "get_time.h"
#include "mem_address.h"
/* Largest picture, in macroblocks, that the DMA dispatch tables declared
 * below are dimensioned for: 20 columns by 15 rows.  m4v_init_decoder()
 * fills those tables using the stream's mb_width / mb_height as indices. */
#define DECODER_MAX_MB_WIDTH 20
#define DECODER_MAX_MB_HEIGH 15
/* Timing accumulators and timestamps.  They are not referenced by the code
 * visible in this part of the file -- presumably consumed by the profiling
 * helpers; confirm before removing. */
long    intra_block_time = 0, mv_time = 0, header_time = 0;
long    start_time, stop_time;

// DMA API Include
#include "dma_api_DSP.h"
#include "reference_block_move_in.h"
/* Per-channel DMA flags, defined in another translation unit.  The slice
 * decoders spin on flag[n] until it is non-zero before starting a transfer
 * on channel n. */
extern volatile int flag[6];
/* Luma/chroma prediction routines, indexed by
 * ((mv.x & 1) << 1) + (mv.y & 1); filled in by m4v_init_decoder(). */
FuncPtr_interpolation funptr_pixinter[4];
/* One reference-loader per macroblock row; decoder_pslice() calls
 * FuncPtr_DMA[mb_row](dec, mb_row) at the start of a slice. */
FuncPtr_DMA_3column FuncPtr_DMA[DECODER_MAX_MB_HEIGH];
/* Single-column reference loaders, indexed [macroblock row][macroblock
 * column]; filled in by m4v_init_decoder(). */
FuncPtr_DMA_1column FuncPtr_DMA_one_column[DECODER_MAX_MB_HEIGH][DECODER_MAX_MB_WIDTH];
//#define _PROFILING_ 1

//#define LCD_OUT

/*
 * Parse the stream headers found in video_header (header_size bytes) and
 * let BitstreamReadHeaders() record the results in *dec.
 *
 * The per-VOP values that BitstreamReadHeaders() also reports (rounding,
 * quantiser, fcode, intra DC threshold) are received into locals and
 * discarded.  The result of BitstreamReadHeaders() is not inspected; this
 * function always returns 0.
 */
xint
m4v_decode_header(DECODER * dec, uint8 * video_header, xint header_size)
{
    Bitstream stream;
    uint32  vop_rounding, vop_quant, vop_fcode, dc_vlc_threshold;

    BitstreamInit(&stream, video_header, header_size);
    BitstreamReadHeaders(&stream, dec, &vop_rounding, &vop_quant, &vop_fcode,
                         &dc_vlc_threshold);

    return 0;
}

/*
 * Create and initialise a decoder instance for the stream described by
 * video_header (header_size bytes).
 *
 * On success the new DECODER is stored in param->handle, the picture size
 * parsed from the header is copied to param->width / param->height, the
 * frame and local buffers are bound to their fixed addresses, and the
 * global interpolation / DMA dispatch tables are filled in.
 *
 * Returns 0 on success and 1 on failure.  Failure means either an
 * allocation failed or the picture size (in macroblocks) is zero or larger
 * than DECODER_MAX_MB_WIDTH x DECODER_MAX_MB_HEIGH, which is what the
 * dispatch tables are dimensioned for.  On failure nothing stays allocated
 * and param->handle is left NULL (except when the very first allocation
 * fails, in which case param is not touched).
 */
xint
m4v_init_decoder(DEC_CTRL * param, uint8 * video_header, xint header_size)
{
    DECODER *dec;
    size_t  slice_bytes;
    int     i, j;

    dec = malloc(sizeof(DECODER));

    if (dec == NULL)
    {
        return 1;
    }
    param->handle = dec;

    /* decode video header for frame width & height */
    m4v_decode_header(dec, video_header, header_size);
    param->width = dec->width;
    param->height = dec->height;
    dec->mb_width = (dec->width + MB_SIZE - 1) / MB_SIZE;
    dec->mb_height = (dec->height + MB_SIZE - 1) / MB_SIZE;

    /* The tables filled in below are indexed by macroblock row/column and
     * have fixed dimensions; refuse sizes that would index outside them
     * (including a zero dimension, which would yield index -1). */
    if (dec->mb_width < 1 || dec->mb_width > DECODER_MAX_MB_WIDTH ||
        dec->mb_height < 1 || dec->mb_height > DECODER_MAX_MB_HEIGH)
    {
        param->handle = NULL;
        free(dec);
        return 1;
    }

    dec->num_mb = dec->mb_height * dec->mb_width;
    dec->nbits_mba = log2bin(dec->num_mb - 1);
    dec->edged_width = MB_SIZE * dec->mb_width + 2 * EDGE_SIZE;
    dec->edged_height = MB_SIZE * dec->mb_height + 2 * EDGE_SIZE;
    dec->decoder_clock = 0;

    /* Slice-number map with one extra row and one extra column.
     * NOTE(review): the factor 2 presumably is the element size -- confirm
     * against the declaration of DECODER.slice. */
    slice_bytes = (dec->mb_width + 1) * (dec->mb_height + 1) * 2;
    dec->slice = malloc(slice_bytes);
    if (dec->slice == NULL)
    {
        /* do not leak the decoder or leave a dangling handle behind */
        param->handle = NULL;
        free(dec);
        return 1;
    }
    /* The slice decoders only write entries from index mb_width + 2 on and
     * number slices from 1, so zero the map to give the never-written
     * border entries a defined value. */
    memset(dec->slice, 0, slice_bytes);

    /* Frame buffers, macroblock array and local working buffers live at
     * fixed addresses supplied by mem_address.h. */
    dec->cur.y = (uint8 *) ((uint32) cur_y_address);
    dec->cur.u = (uint8 *) ((uint32) cur_u_address);
    dec->cur.v = (uint8 *) ((uint32) cur_v_address);
    dec->refn.y = (uint8 *) ((uint32) refn_y_address);
    dec->refn.u = (uint8 *) ((uint32) refn_u_address);
    dec->refn.v = (uint8 *) ((uint32) refn_v_address);

    dec->mbs = (MACROBLOCK *) ((uint32) macroblock_buffer_address);

    dec->local_cur.y = (uint8 *) ((uint32) local_cur_y_address);
    dec->local_cur.u = (uint8 *) ((uint32) local_cur_u_address);
    dec->local_cur.v = (uint8 *) ((uint32) local_cur_v_address);

    dec->local_refn.y = (uint8 *) ((uint32) local_refn_y_address);
    dec->local_refn.u = (uint8 *) ((uint32) local_refn_u_address);
    dec->local_refn.v = (uint8 *) ((uint32) local_refn_v_address);

    /* keep the unadjusted start of the local reference buffer ... */
    dec->local_refn_start.y = dec->local_refn.y;
    dec->local_refn_start.u = dec->local_refn.u;
    dec->local_refn_start.v = dec->local_refn.v;

    /* ... and advance the working pointers past the top and left edge */
    dec->local_refn.y += (local_ref_column_size * 16) * REF_UP_EDGE + EDGE_SIZE;
    dec->local_refn.u +=
        (local_ref_column_size * 8) * REF_UP_EDGE_2 + EDGE_SIZE2;
    dec->local_refn.v +=
        (local_ref_column_size * 8) * REF_UP_EDGE_2 + EDGE_SIZE2;

    init_timer();
    init_vlc_tables();

    /* index = ((mv.x & 1) << 1) + (mv.y & 1), see decoder_mbinter() */
    funptr_pixinter[0] = transfer8x8_copy;
    funptr_pixinter[1] = HW_halfpel8x8_v;
    funptr_pixinter[2] = HW_halfpel8x8_h;
    funptr_pixinter[3] = HW_halfpel8x8_hv;

    /* The size of IRMB is 3x8 MBs */
    /* set up functions to fill 3x3 MBs to IRMB buffer using DMA */
    for (i = 1; i < dec->mb_height - 1; i++)
    {
        FuncPtr_DMA[i] = fun_DMA_3column_not08;
    }
    FuncPtr_DMA[0] = fun_DMA_3column_0;
    FuncPtr_DMA[dec->mb_height - 1] = fun_DMA_3column_8;

    /* set up functions to move 3x1 MBs to IRMB buffer using DMA */
    /* For the first row, when y = 0 */
    for (i = 0; i < dec->mb_width - 1; i++)
    {
        if ((i % 6) == 0) /* reset every 6 columns (8-2 = 6) */
        {
            FuncPtr_DMA_one_column[0][i] = fun_DMA_1column_y0_x6;
        }
        else
        {
            FuncPtr_DMA_one_column[0][i] = fun_DMA_1column_y0_xnot610;
        }
    }
    FuncPtr_DMA_one_column[0][dec->mb_width - 1] = fun_DMA_1column_y0_x10;

    /* For the last row, when y = mb_height - 1 (for a one-row picture this
     * deliberately overwrites the first-row entries, as before) */
    for (i = 0; i < dec->mb_width - 1; i++)
    {
        if ((i % 6) == 0)
        {
            FuncPtr_DMA_one_column[dec->mb_height - 1][i] =
                fun_DMA_1column_y8_x6;
        }
        else
        {
            FuncPtr_DMA_one_column[dec->mb_height - 1][i] =
                fun_DMA_1column_y8_xnot610;
        }
    }
    FuncPtr_DMA_one_column[dec->mb_height - 1][dec->mb_width - 1] =
        fun_DMA_1column_y8_x10;

    /* For the middle rows */
    for (j = 1; j < dec->mb_height - 1; j++)
    {
        for (i = 0; i < dec->mb_width - 1; i++)
        {
            if ((i % 6) == 0)
            {
                FuncPtr_DMA_one_column[j][i] = fun_DMA_1column_y17_x6;
            }
            else
            {
                FuncPtr_DMA_one_column[j][i] = fun_DMA_1column_y17_xnot610;
            }
        }
        FuncPtr_DMA_one_column[j][dec->mb_width - 1] = fun_DMA_1column_y17_x10;
    }

    initialize_clip_table();
    return 0;
}

/*
 * Release a decoder created by m4v_init_decoder().
 *
 * Frees the slice map and the DECODER object itself -- the two heap
 * allocations made by m4v_init_decoder(); the frame and working buffers
 * are fixed-address memory and are not freed.  vdec_obj->handle is reset
 * to NULL so that a second call is harmless.  A NULL vdec_obj or a NULL
 * handle is accepted and ignored.
 *
 * Always returns 0.
 */
xint
m4v_free_decoder(DEC_CTRL *vdec_obj)
{
    DECODER *dec;

    if (vdec_obj == NULL || vdec_obj->handle == NULL)
    {
        return 0;
    }

    dec = (DECODER *) vdec_obj->handle;
    free(dec->slice);           /* free(NULL) is a no-op */
    free(dec);
    vdec_obj->handle = NULL;

    return 0;
}

/* Quantiser adjustment selected by the 2-bit code that decoder_islice()
 * reads for MODE_INTRA_Q macroblocks; the selected value is added to
 * mb->quant. */
static const int32 dquant_table[4] = {
    -1, -2, 1, 2
};

/* Scratch buffers shared by decoder_mbintra() and decoder_mbinter(): room
 * for six blocks of 64 int16 values each, block number i starting at
 * offset i * 64.  `block` receives the VLC-decoded coefficients; `data`
 * receives the dequantised coefficients, which are then passed to
 * idct_HW() and to the transfer_16to8* routines. */
int16   block[6 * 64];
int16   data[6 * 64];

/*
 * Decode one intra macroblock and store its pixels in the local
 * current-frame buffer.
 *
 *   dec                 decoder state (macroblock array, slice map, buffers)
 *   pMB                 macroblock being decoded; pMB->quant is the
 *                       quantiser used for scaling and dequantisation
 *   x_pos, y_pos        macroblock coordinates
 *   acpred_flag         when zero, the AC prediction direction of every
 *                       block is forced to 0
 *   cbp                 coded block pattern, bit (5 - i) belongs to block i
 *   bs                  bitstream to read from
 *   quant               quantiser compared against intra_dc_threshold
 *   intra_dc_threshold  while quant is below this value the DC coefficient
 *                       is read with the separate DC VLC
 *
 * Blocks 0..3 are handled as luma and 4, 5 as chroma (U, V).  Works on the
 * file-scope scratch buffers `block` and `data`.
 */
void
decoder_mbintra(DECODER * dec,
                MACROBLOCK * pMB,
                const uint32 x_pos,
                const uint32 y_pos,
                const uint32 acpred_flag,
                const uint32 cbp,
                Bitstream * bs,
                const uint32 quant, const uint32 intra_dc_threshold)
{
    const uint32 mb_quant = pMB->quant;
    uint8  *dst_y, *dst_y_lower, *dst_u, *dst_v;
    uint32  column;
    uint32  blk;

    /* start from all-zero coefficients for the six blocks */
    memset(block, 0, sizeof(block));

    for (blk = 0; blk < 6; blk++)
    {
        const uint32 is_luma = (blk < 4) ? 1 : 0;
        uint32  dc_scaler = get_dc_scaler(mb_quant, is_luma);
        int16  *coeffs = &block[blk * 64];
        int16  *pixels = &data[blk * 64];
        int16   predictors[8];
        int32   first_coeff = 0;

#if defined(_PROFILING_)
        start_timer();
#endif
        predict_acdc(dec->mbs, x_pos, y_pos, dec->mb_width, blk, coeffs,
                     mb_quant, dc_scaler, predictors, dec->slice);
        if (acpred_flag == 0)
        {
            pMB->acpred_directions[blk] = 0;
        }
#if defined(_PROFILING_)
        stop_prediction_timer();    /* DC/AC prediction */
#endif

        if (quant < intra_dc_threshold)
        {
            /* DC differential is coded separately from the AC run */
            int32   dc_size;
            int32   dc_dif = 0;

            if (is_luma)
            {
                dc_size = get_dc_size_lum(bs);
            }
            else
            {
                dc_size = get_dc_size_chrom(bs);
            }

            if (dc_size != 0)
            {
                dc_dif = get_dc_dif(bs, dc_size);
            }
            if (dc_size > 8)
            {
                BitstreamSkip(bs, 1);   /* marker */
            }

            coeffs[0] = dc_dif;
            first_coeff = 1;
        }

#if defined(_PROFILING_)
        start_timer();
#endif
        if ((cbp >> (5 - blk)) & 1) /* block is coded */
        {
            get_intra_block(bs, coeffs, pMB->acpred_directions[blk],
                            first_coeff);
        }
#if defined(_PROFILING_)
        stop_coding_timer();    /* VLC decoding */
        start_timer();
#endif

        add_acdc(pMB, blk, coeffs, dc_scaler, predictors);
#if defined(_PROFILING_)
        stop_prediction_timer();
        start_timer();
#endif

        dequant_intra(pixels, coeffs, mb_quant, dc_scaler);
#if defined(_PROFILING_)
        stop_iquant_timer();
        start_timer();
#endif

        idct_HW(pixels);
#if defined(_PROFILING_)
        stop_idct_timer();
#endif
    }

#if defined(_PROFILING_)
    start_timer();
#endif
    /* destination of this macroblock inside the local current buffer */
    column = x_pos % (local_cur_column_size);
    dst_y = dec->local_cur.y + (column << 4);
    dst_u = dec->local_cur.u + (column << 3);
    dst_v = dec->local_cur.v + (column << 3);
    dst_y_lower = dst_y + local_cur_column_size * 16 * 8;

    /* four luma blocks (2 x 2), then U and V */
    transfer_16to8copy(dst_y, &data[0 * 64], local_cur_column_size * 16);
    transfer_16to8copy(dst_y + 8, &data[1 * 64], local_cur_column_size * 16);
    transfer_16to8copy(dst_y_lower, &data[2 * 64],
                       local_cur_column_size * 16);
    transfer_16to8copy(dst_y_lower + 8, &data[3 * 64],
                       local_cur_column_size * 16);
    transfer_16to8copy(dst_u, &data[4 * 64], local_cur_column_size * 8);
    transfer_16to8copy(dst_v, &data[5 * 64], local_cur_column_size * 8);
#if defined(_PROFILING_)
    stop_transfer_timer();
#endif
}

/* SIGN(X) is 1 for X > 0 and -1 otherwise (so it is -1 for X == 0);
 * ABS(X) is the magnitude of X.  Both macros evaluate their argument more
 * than once, so do not pass expressions with side effects. */
#define SIGN(X) (((X)>0)?1:-1)
#define ABS(X) (((X)>0)?(X):-(X))
/* Rounding term used by decoder_mbinter() when it derives the chroma
 * motion vector from the sum of four luma vectors; indexed by
 * ABS(sum) % 16. */
static const uint32 roundtab[16] =
    { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2 };

/*
 * Decode one inter macroblock: build the motion-compensated prediction in
 * the local current-frame buffer from the local reference buffer, then
 * decode the residual of every coded block and add it on top.
 *
 *   dec          decoder state (local current / reference buffers)
 *   pMB          macroblock; mode, mvs[0..3] and quant are read
 *   x_pos        macroblock column (selects the position inside the local
 *                buffers)
 *   y_pos        not used by this function
 *   acpred_flag  not used by this function
 *   cbp          coded block pattern, bit (5 - i) belongs to block i
 *   bs           bitstream the residual is read from
 *   quant        not used by this function (pMB->quant is used instead)
 *   rounding     rounding control; 1 - rounding is passed to the
 *                interpolation routines
 *
 * Works on the file-scope scratch buffers `block` and `data`.
 */
void
decoder_mbinter(DECODER * dec,
                const MACROBLOCK * pMB,
                const uint32 x_pos,
                const uint32 y_pos,
                const uint32 acpred_flag,
                const uint32 cbp,
                Bitstream * bs, const uint32 quant, const uint32 rounding)
{
    /* NOTE: these two initial values are never used; stride and stride2
     * are overwritten with the local reference buffer strides below. */
    uint32  stride = dec->edged_width;
    uint32  stride2 = stride / 2;
    uint32  i;
    uint32  iQuant = pMB->quant;
    uint8  *pY_Cur, *pU_Cur, *pV_Cur;
    int32   uv_dx, uv_dy;
    int     dx, dy;
    int     temp_for_x_pos;

    /* Derive the chroma motion vector. */
    if (pMB->mode == MODE_INTER || pMB->mode == MODE_INTER_Q)
    {
        /* one vector for the whole macroblock: halve it, forcing the
         * result odd whenever either of the two low bits was set */
        uv_dx = pMB->mvs[0].x;
        uv_dy = pMB->mvs[0].y;

        uv_dx = (uv_dx & 3) ? (uv_dx >> 1) | 1 : uv_dx / 2;
        uv_dy = (uv_dy & 3) ? (uv_dy >> 1) | 1 : uv_dy / 2;
    }
    else
    {
        /* four vectors: combine their sum using roundtab */
        int32   sum;
        sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
        uv_dx =
            (sum ==
             0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] +
                                  (ABS(sum) / 16) * 2));

        sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
        uv_dy =
            (sum ==
             0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] +
                                  (ABS(sum) / 16) * 2));
    }
#if defined(_PROFILING_)
    start_timer();
#endif
    /* destination of this macroblock inside the local current buffer:
     * 16 luma / 8 chroma bytes per macroblock column */
    pY_Cur = dec->local_cur.y + ((x_pos % (local_cur_column_size)) << 4);
    pU_Cur = dec->local_cur.u + ((x_pos % (local_cur_column_size)) << 3);
    pV_Cur = dec->local_cur.v + ((x_pos % (local_cur_column_size)) << 3);

    /* macroblock column inside the local reference buffer; wraps every
     * (local_ref_column_size - 2) columns, e.g. x=6 => 0, x=7 => 1 */
    temp_for_x_pos = (x_pos % ((local_ref_column_size - 2)));
    stride = local_ref_column_size << 4;    //*16
    stride2 = stride >> 1;

    /*
     * Luma prediction, one call per 8x8 block with its own vector.
     * The routine is picked by the half-pel bits of the vector,
     * index = ((dx & 1) << 1) + (dy & 1) (see m4v_init_decoder() for the
     * table contents).  (d - (d & 1)) / 2 is the vector component with
     * its low bit cleared, halved.  The arguments passed are, in order:
     * destination, source, source stride, destination stride and
     * 1 - rounding.
     * NOTE(review): for negative dy the row offset is multiplied by the
     * unsigned stride, so the address relies on unsigned wrap-around at
     * pointer width -- confirm before porting to another target.
     */
    /* block 0: top-left */
    //tempx=16 * temp_for_x_pos;
    dx = pMB->mvs[0].x;
    dy = pMB->mvs[0].y;
    funptr_pixinter[((dx & 1) << 1) + (dy & 1)] (pY_Cur,
                                                 dec->local_refn.y +
                                                 ((dy -
                                                   (dy & 1)) / 2) * stride +
                                                 (16 * temp_for_x_pos) +
                                                 ((dx - (dx & 1)) / 2), stride,
                                                 local_cur_column_size * 16,
                                                 1 - rounding);
    /* block 1: top-right */
    //tempx=16 * temp_for_x_pos+8;
    dx = pMB->mvs[1].x;
    dy = pMB->mvs[1].y;
    funptr_pixinter[((dx & 1) << 1) + (dy & 1)] (pY_Cur + 8,
                                                 dec->local_refn.y +
                                                 ((dy -
                                                   (dy & 1)) / 2) * stride +
                                                 (16 * temp_for_x_pos + 8) +
                                                 ((dx - (dx & 1)) / 2), stride,
                                                 local_cur_column_size * 16,
                                                 1 - rounding);

    /* block 2: bottom-left (8 rows further down in both buffers) */
    //tempx=16 * temp_for_x_pos;
    dx = pMB->mvs[2].x;
    dy = pMB->mvs[2].y;
    funptr_pixinter[((dx & 1) << 1) + (dy & 1)] (pY_Cur +
                                                 local_cur_column_size * 16 * 8,
                                                 dec->local_refn.y +
                                                 (stride * 8) +
                                                 ((dy -
                                                   (dy & 1)) / 2) * stride +
                                                 (16 * temp_for_x_pos) +
                                                 ((dx - (dx & 1)) / 2), stride,
                                                 local_cur_column_size * 16,
                                                 1 - rounding);

    /* block 3: bottom-right */
    //tempx=16 * temp_for_x_pos+8;
    dx = pMB->mvs[3].x;
    dy = pMB->mvs[3].y;
    funptr_pixinter[((dx & 1) << 1) + (dy & 1)] (pY_Cur +
                                                 local_cur_column_size * 16 *
                                                 8 + 8,
                                                 dec->local_refn.y +
                                                 (stride * 8) +
                                                 ((dy -
                                                   (dy & 1)) / 2) * stride +
                                                 (16 * temp_for_x_pos + 8) +
                                                 ((dx - (dx & 1)) / 2), stride,
                                                 local_cur_column_size * 16,
                                                 1 - rounding);

    /* Chroma prediction: U and V use the same derived vector and the
     * halved stride. */
    //tempx=8 * temp_for_x_pos;
    funptr_pixinter[((uv_dx & 1) << 1) + (uv_dy & 1)] (pU_Cur,
                                                       dec->local_refn.u +
                                                       ((uv_dy -
                                                         (uv_dy & 1)) / 2) *
                                                       stride2 +
                                                       (8 * temp_for_x_pos) +
                                                       ((uv_dx -
                                                         (uv_dx & 1)) / 2),
                                                       stride2,
                                                       local_cur_column_size *
                                                       8, 1 - rounding);
    funptr_pixinter[((uv_dx & 1) << 1) + (uv_dy & 1)] (pV_Cur,
                                                       dec->local_refn.v +
                                                       ((uv_dy -
                                                         (uv_dy & 1)) / 2) *
                                                       stride2 +
                                                       (8 * temp_for_x_pos) +
                                                       ((uv_dx -
                                                         (uv_dx & 1)) / 2),
                                                       stride2,
                                                       local_cur_column_size *
                                                       8, 1 - rounding);
#if defined(_PROFILING_)
    stop_comp_timer();
#endif
    /* Residual: VLC-decode, dequantise and inverse-transform every block
     * whose bit is set in cbp.  Blocks that are not coded leave their part
     * of `data` untouched; it is not read for them below. */
    for (i = 0; i < 6; i++)
    {
        if (cbp & (1 << (5 - i)))   // coded
        {
            memset(&block[i * 64], 0, 64 * sizeof(int16));  // clear
#if defined(_PROFILING_)
            start_timer();
#endif
            get_inter_block(bs, &block[i * 64]);

#if defined(_PROFILING_)
            stop_coding_timer();    //VLC Decoding
#endif

#if defined(_PROFILING_)
            start_timer();
#endif
            dequant_inter(&data[i * 64], &block[i * 64], iQuant);

#if defined(_PROFILING_)
            stop_iquant_timer();
#endif

#if defined(_PROFILING_)
            start_timer();
#endif
            //idct(&data[i * 64]);
            idct_HW(&data[i * 64]);

#if defined(_PROFILING_)
            stop_idct_timer();
#endif
        }
    }
#if defined(_PROFILING_)
    start_timer();
#endif
    /* Add the residual of each coded block onto its prediction
     * (cbp bits 32, 16, 8, 4 = luma blocks 0..3; 2 = U; 1 = V). */
    if (cbp & 32)
        transfer_16to8add(pY_Cur, &data[0 * 64], local_cur_column_size * 16);
    if (cbp & 16)
        transfer_16to8add(pY_Cur + 8, &data[1 * 64],
                          local_cur_column_size * 16);
    if (cbp & 8)
        transfer_16to8add(pY_Cur + local_cur_column_size * 16 * 8,
                          &data[2 * 64], local_cur_column_size * 16);
    if (cbp & 4)
        transfer_16to8add(pY_Cur + 8 + local_cur_column_size * 16 * 8,
                          &data[3 * 64], local_cur_column_size * 16);
    if (cbp & 2)
        transfer_16to8add(pU_Cur, &data[4 * 64], local_cur_column_size * 8);
    if (cbp & 1)
        transfer_16to8add(pV_Cur, &data[5 * 64], local_cur_column_size * 8);
#if defined(_PROFILING_)
    stop_transfer_timer();
#endif
}

/* ========================================================================= */
/*    Function : decode_video_packet_header()                                */
/*    Author   : CJ Tsai,                                                    */
/*    Date     : Feb/04/2003                                                 */
/* ------------------------------------------------------------------------- */
/*    The function parses the header extension code (HEC) fields, but does   */
/*    not make use of the redundant information.  It also does not handle    */
/*    B-frame video packet headers.                                          */
/* ========================================================================= */
/*
 * Consume a video packet header, starting with byte alignment.
 *
 *   dec    supplies the field widths nbits_mba, quant_bits, time_inc_bits
 *   bs     bitstream positioned in front of the header
 *   fcode  non-zero: the resync marker is fcode + 16 bits long and an
 *          fcode field follows in the extension; zero: the marker is
 *          17 bits long and no fcode field is read
 *   quant  receives the quant_scale field
 *
 * Every other field, including the macroblock number and the whole
 * header extension, is read and discarded.
 */
void    __inline
decode_video_packet_header(DECODER * dec, Bitstream * bs, xint fcode,
                           xint * quant)
{
    xint    marker_bits;
    uint32  has_extension;

    if (fcode)
    {
        marker_bits = fcode + 16;
    }
    else
    {
        marker_bits = 17;
    }

    BitstreamByteAlign(bs);
    BitstreamGetBits(bs, marker_bits);      /* resync marker */
    BitstreamGetBits(bs, dec->nbits_mba);   /* macroblock_number */
    *quant = BitstreamGetBits(bs, dec->quant_bits); /* quant_scale */

    has_extension = BitstreamGetBit(bs);    /* header_extension_code flag */
    if (!has_extension)
    {
        return;
    }

    /* modulo_time_base: skip bits up to and including the first zero */
    while (BitstreamGetBits(bs, 1))
    {
        /* do nothing */
    }
    READ_MARKER();

    /* vop_time_increment (1-16 bits) */
    BitstreamGetBits(bs, dec->time_inc_bits);
    READ_MARKER();

    /* vop_prediction_type (2 bits) */
    BitstreamGetBits(bs, 2);

    /* intra_dc_vlc_thr */
    BitstreamGetBits(bs, 3);

    /* fcode */
    if (fcode)
    {
        BitstreamGetBits(bs, 3);
    }
}

/*
 * Consume a video packet header WITHOUT aligning the bitstream to a byte
 * boundary first, and report the macroblock number it carries.
 *
 *   dec    supplies the field widths nbits_mba, quant_bits, time_inc_bits
 *   bs     bitstream positioned at the resync marker
 *   fcode  non-zero: the resync marker is fcode + 16 bits long and an
 *          fcode field follows in the extension; zero: the marker is
 *          17 bits long and no fcode field is read
 *   quant  receives the quant_scale field
 *
 * Returns the macroblock_number field.  The header extension, if present,
 * is read and discarded.
 */
uint __inline
decode_video_packet_header_no_aligen(DECODER * dec, Bitstream * bs, xint fcode,
                                     xint * quant)
{
    xint    first_mb;
    xint    marker_bits;
    uint32  has_extension;

    if (fcode)
    {
        marker_bits = fcode + 16;
    }
    else
    {
        marker_bits = 17;
    }

    /* no BitstreamByteAlign() here, unlike decode_video_packet_header() */
    BitstreamGetBits(bs, marker_bits);                  /* resync marker */
    first_mb = BitstreamGetBits(bs, dec->nbits_mba);    /* macroblock_number */
    *quant = BitstreamGetBits(bs, dec->quant_bits);     /* quant_scale */

    has_extension = BitstreamGetBit(bs);    /* header_extension_code flag */
    if (has_extension)
    {
        /* modulo_time_base: skip bits up to and including the first zero */
        while (BitstreamGetBits(bs, 1))
        {
            /* do nothing */
        }
        READ_MARKER();

        /* vop_time_increment (1-16 bits) */
        BitstreamGetBits(bs, dec->time_inc_bits);
        READ_MARKER();

        /* vop_prediction_type (2 bits) */
        BitstreamGetBits(bs, 2);

        /* intra_dc_vlc_thr */
        BitstreamGetBits(bs, 3);

        /* fcode */
        if (fcode)
        {
            BitstreamGetBits(bs, 3);
        }
    }

    return first_mb;
}

/*
 * Decode one slice (ROW_PER_SLICE macroblock rows) of an intra picture and
 * move the decoded pixels from the local current-frame buffer (ICMB) to
 * the frame buffer dec->cur by DMA.
 *
 *   vdec_obj            decoder control block; its handle is the DECODER
 *   bs                  bitstream positioned at the slice data (at the
 *                       video packet header when slice_num != 0)
 *   quant               initial quantiser; replaced by the value from the
 *                       video packet header when slice_num != 0
 *   intra_dc_threshold  forwarded to decoder_mbintra()
 *   slice_num           index of the slice; 0 is the first slice, which
 *                       starts at macroblock row 0 and has no packet header
 *
 * Every local_cur_column_size macroblocks the full local buffer is written
 * out; at the end of each row the remaining columns are written out.
 * Before each transfer the function spins until the flag of the DMA
 * channel it is about to use (5 = Y, 4 = U, 1 = V) is non-zero.
 */
void
decoder_islice(DEC_CTRL *vdec_obj, Bitstream * bs, xint quant,
               xint intra_dc_threshold, int slice_num)
{
    DECODER *dec = (DECODER *) vdec_obj->handle;
    uint    resync_marker_length = 17;
    uint    mb_idx, slice_idx, slice_no;
    /* First row of this slice.  Must start at 0: for slice 0 no packet
     * header is parsed, and this value was previously read uninitialised. */
    uint    mb_row = 0;
    uint    x, y;

    /* we need to keep previous QP for intra_dc_vlc_thr.  CJ Tsai 03/02/2003 */
    xint    running_qp;
    dma_port src_obj, dst_obj;
    dma_channel channel_obj;

    running_qp = quant;

    if (slice_num != 0)
    {
        /* the packet header carries the number of the first macroblock of
         * the slice; convert it to a row number */
        mb_row = decode_video_packet_header_no_aligen(dec, bs, 0, &quant);
        mb_row /= dec->mb_width;
        running_qp = quant;
    }

    slice_no = slice_num + 1;
    mb_idx = 0;
    /* the slice map has one extra leading row and column, hence the
     * (mb_width + 1) pitch and the offset of one row plus one entry */
    slice_idx = dec->mb_width + 2 + (dec->mb_width + 1) * mb_row;

    for (y = mb_row; y < mb_row + ROW_PER_SLICE; y++)
    {
        for (x = 0; x < dec->mb_width; x++, mb_idx++, slice_idx++)
        {
            MACROBLOCK *mb = &dec->mbs[mb_idx];
            uint32  mcbpc;
            uint32  cbpc;
            uint32  acpred_flag;
            uint32  cbpy;
            uint32  cbp;

            dec->slice[slice_idx] = slice_no;

            /* Decode one combined mode macroblock */
            mcbpc = get_mcbpc_intra(bs);
            mb->mode = mcbpc & 7;
            cbpc = (mcbpc >> 4);

            acpred_flag = BitstreamGetBit(bs);
            /*
               mb_type              Name
               not coded    -
               0                    INTER
               1                    INTER+Q
               3                    INTRA
               4                    INTRA+Q
               stuffing             -
             */
            if (mb->mode == MODE_STUFFING)
            {
                // DEBUG("-- STUFFING ?");
                continue;
            }

            cbpy = get_cbpy(bs, 1);
            cbp = (cbpy << 2) | cbpc;

            mb->quant = running_qp;
            if (mb->mode == MODE_INTRA_Q)
            {
                /* apply dquant and clamp the quantiser to 1..31 */
                mb->quant += dquant_table[BitstreamGetBits(bs, 2)];
                if (mb->quant > 31)
                {
                    mb->quant = 31;
                }
                else if (mb->quant < 1)
                {
                    mb->quant = 1;
                }
            }

            /* the previous QP (running_qp), not mb->quant, is what gets
             * compared against intra_dc_threshold */
            decoder_mbintra(dec, mb, x, y, acpred_flag, cbp, bs,
                            running_qp, intra_dc_threshold);
            running_qp = mb->quant;

            /* decode video packet header, if any */
            if (BitstreamShowBitsByteAlign(bs, resync_marker_length) == 1)
            {
                decode_video_packet_header(dec, bs, 0, &quant);
                running_qp = quant;
                slice_no++;
            }

            /* local buffer is full: write it out.
             * NOTE(review): src_obj's index fields are not set for these
             * dma_api_ssd_pd() calls -- presumably that variant ignores
             * them; confirm against the DMA API. */
            if ((x+1) % local_cur_column_size == 0)
            {
#if defined(_PROFILING_)
                start_timer();
#endif
                /* move Luma ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.y;
                dst_obj.staddr = (uint32) ((uint32) dec->cur.y +
                                           (uint32) (y << 4) * dec->edged_width) +
                                           (uint32) (x / local_cur_column_size) *
                                                    (local_cur_column_size * 16);

                //(stride-elecnt)*2+1; //local_cur_column_size*16=96
                dst_obj.addressing_frmIndex = (dec->edged_width - 96) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 96;
                channel_obj.frmCnt = 16;
                while (flag[5] == 0);
                channel_obj.channel_number = 5;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

                /* move Chroma U ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.u;
                dst_obj.staddr = (uint32)
                    (dec->cur.u + (y << 3) * (dec->edged_width >> 1)) +
                    (x / local_cur_column_size) * (local_cur_column_size * 8);
                //(stride-elecnt)*2+1;
                dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - 48) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 48;
                channel_obj.frmCnt = 8;
                while (flag[4] == 0);
                channel_obj.channel_number = 4;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

                /* move Chroma V ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.v;
                dst_obj.staddr = (uint32) (dec->cur.v +
                                            (y << 3) * (dec->edged_width >> 1)) +
                            (x / local_cur_column_size) * (local_cur_column_size * 8);
                //(stride-elecnt)*2+1;
                dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - 48) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 48;
                channel_obj.frmCnt = 8;
                while (flag[1] == 0);
                channel_obj.channel_number = 1;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

#if defined(_PROFILING_)
                stop_dma_out();
#endif
            }
        }
        /* skip the extra column of the slice map */
        slice_idx++;

#if defined(_PROFILING_)
        start_timer();
#endif

        /* move the last few MBs per row in ICMB to SDRAM; here
         * x == dec->mb_width, so x % local_cur_column_size is the number
         * of macroblock columns still waiting in the local buffer.
         * NOTE(review): when mb_width is a multiple of
         * local_cur_column_size this count is 0 and the transfers below
         * are issued with elmCnt == 0 -- confirm the DMA API tolerates
         * that. */
        /* move Luma ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.y;
        src_obj.addressing_frmIndex =
            (local_cur_column_size * 16 -
             (x % local_cur_column_size) * 16) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) ((uint32) dec->cur.y + (uint32) (y << 4) * dec->edged_width) + (uint32) (x / local_cur_column_size) * (local_cur_column_size * 16);   //dy
        dst_obj.addressing_frmIndex = (dec->edged_width - (x % local_cur_column_size) * 16) * 2 + 1;    //(stride-elecnt)*2+1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 16;
        channel_obj.frmCnt = 16;
        channel_obj.channel_number = 5;
        while (flag[5] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);

        /* move Chroma U ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.u;
        src_obj.addressing_frmIndex =
            (local_cur_column_size * 8 - (x % local_cur_column_size) * 8) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) (dec->cur.u + (y << 3) * (dec->edged_width >> 1)) + (x / local_cur_column_size) * (local_cur_column_size * 8);
        dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - (x % local_cur_column_size) * 8) * 2 + 1;  //(stride-elecnt)*2+1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 8;
        channel_obj.frmCnt = 8;
        channel_obj.channel_number = 4;
        while (flag[4] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);

        /* move Chroma V ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.v;
        src_obj.addressing_frmIndex =
            (local_cur_column_size * 8 - (x % local_cur_column_size) * 8) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) (dec->cur.v + (y << 3) * (dec->edged_width >> 1)) + (x / local_cur_column_size) * (local_cur_column_size * 8);
        dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - (x % local_cur_column_size) * 8) * 2 + 1;  //(stride-elecnt)*2+1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 8;
        channel_obj.frmCnt = 8;
        channel_obj.channel_number = 1;
        while (flag[1] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);

#if defined(_PROFILING_)
        stop_dma_out();
#endif
    }
}

/*
 * Read one motion vector from the bitstream and reconstruct it.
 *
 *   dec    decoder state (macroblock array, mb_width, slice map)
 *   bs     bitstream to read the two differential components from
 *   x, y   macroblock coordinates
 *   k      block index forwarded to get_pmvdata()
 *   mv     receives the reconstructed vector
 *   fcode  controls the vector range: with s = 1 << (fcode - 1) the valid
 *          interval is [-32 * s, 32 * s - 1]
 *
 * The differential is added to the first predictor returned by
 * get_pmvdata(); a component that leaves the valid interval is wrapped
 * back once by the interval length 64 * s.
 */
void
get_motion_vector(DECODER * dec, Bitstream * bs, int32 x, int32 y, int32 k,
                  VECTOR * mv, int32 fcode)
{
    const int32 scale = 1 << (fcode - 1);
    const int32 span = 64 * scale;
    const int32 upper = 32 * scale - 1;
    const int32 lower = -32 * scale;

    VECTOR  predictors[4];
    int32   sads[4];            /* filled by get_pmvdata(), not used here */
    int32   vx, vy;

    get_pmvdata(dec->mbs, x, y, dec->mb_width, k, predictors, sads,
                dec->slice);

    /* horizontal differential first, then vertical */
    vx = get_mv(bs, fcode);
    vy = get_mv(bs, fcode);

    vx += predictors[0].x;
    vy += predictors[0].y;

    if (vx < lower)
    {
        vx += span;
    }
    else if (vx > upper)
    {
        vx -= span;
    }

    if (vy < lower)
    {
        vy += span;
    }
    else if (vy > upper)
    {
        vy -= span;
    }

    mv->x = vx;
    mv->y = vy;
}

/*
 * Decode the macroblocks of one slice using the inter (P) macroblock syntax.
 *
 * Each macroblock is reconstructed into the on-chip current-MB buffers
 * (dec->local_cur) and written back to the frame in SDRAM (dec->cur) by DMA,
 * a group of local_cur_column_size macroblocks at a time.  Reference data is
 * brought on-chip through the FuncPtr_DMA / FuncPtr_DMA_one_column tables.
 *
 * vdec_obj            decoder control object; vdec_obj->handle is the DECODER.
 * bs                  bitstream reader for this slice.
 * rounding            forwarded to decoder_mbinter().
 * quant               initial quantizer; overwritten by the value parsed from
 *                     the video packet header when slice_num != 0.
 * fcode               forwarded to get_motion_vector(); also defines the
 *                     resync marker length (fcode + 16).
 * intra_dc_threshold  forwarded to decoder_mbintra().
 * slice_num           zero-based slice index.  No video packet header is
 *                     parsed for slice 0, which is decoded from MB row 0.
 */
void
decoder_pslice(DEC_CTRL *vdec_obj, Bitstream *bs, xint rounding, xint quant,
               xint fcode, xint intra_dc_threshold, int slice_num)
{
    DECODER *dec = (DECODER *) vdec_obj->handle;
    uint    resync_marker_length = fcode + 16;
    uint    mb_idx, slice_idx, slice_no;
    /* Only slices other than the first parse a video packet header to learn
     * their starting row, so the first slice must default to row 0.  Without
     * this initializer mb_row was read uninitialized for slice_num == 0. */
    uint    mb_row = 0;
    uint    x, y, z;
    int     temp_size_for_stride;
    int     i;
    uint32  mcbpc;
    uint32  cbpc;
    uint32  acpred_flag;
    uint32  cbpy;
    uint32  cbp;
    uint32  intra;
    uint8  *src, *dst;
    dma_port src_obj, dst_obj;
    dma_channel channel_obj;
    /* we need to keep previous QP for intra_dc_vlc_thr.  CJ Tsai 03/02/2003 */
    xint    running_qp;

    running_qp = quant;

    if (slice_num != 0)
    {
        /* The header yields a macroblock number; convert it to a row. */
        mb_row = decode_video_packet_header_no_aligen(dec, bs, 0, &quant);
        mb_row /= dec->mb_width;
        running_qp = quant;
    }

    /* NOTE(review): mb_idx restarts at 0 for every slice while x/y below are
     * frame coordinates -- confirm this is the intended dec->mbs indexing. */
    mb_idx = 0;
    slice_no = slice_num + 1;
    /* dec->slice has one extra leading row and column (mb_width + 1 entries
     * per row); skip them to reach MB (0, mb_row). */
    slice_idx = dec->mb_width + 2 + (dec->mb_width + 1) * mb_row;

    /* copy the initial 3x3 reference MBs into IRMB */
#if defined(_PROFILING_)
    start_timer();
#endif
    FuncPtr_DMA[mb_row] (dec, mb_row);
#if defined(_PROFILING_)
    stop_dma_in_3();
#endif

    for (y = mb_row; y < mb_row + ROW_PER_SLICE; y++)
    {

        for (x = 0; x < dec->mb_width; x++, mb_idx++, slice_idx++)
        {
            MACROBLOCK *mb = &dec->mbs[mb_idx];
            dec->slice[slice_idx] = slice_no;

            /* A full group of local_cur_column_size MBs has been decoded:
             * write it back to the frame before the buffers are reused. */
            if ((x % (local_cur_column_size) == 0) && x > 0)
            {
#if defined(_PROFILING_)
                start_timer();
#endif
                /* move Luma ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.y;
                dst_obj.staddr = (uint32) ((uint32) dec->cur.y +
                                           (uint32) (y << 4) * dec->edged_width) +
                                           (uint32) ((x - 1) / local_cur_column_size) *
                                                          (local_cur_column_size * 16);
                /* frame index = (stride - elmCnt) * 2 + 1; the constant 96 is
                 * local_cur_column_size * 16 according to the original note */
                dst_obj.addressing_frmIndex = (dec->edged_width - 96) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 96;
                channel_obj.frmCnt = 16;
                /* busy-wait on the channel flag (presumably non-zero once the
                 * previous transfer on this channel has completed) */
                while (flag[5] == 0);
                channel_obj.channel_number = 5;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

                /* move Chroma U ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.u;
                dst_obj.staddr = (uint32) (dec->cur.u + (y << 3) * (dec->edged_width >> 1)) +
                                    ((x - 1) / local_cur_column_size) * (local_cur_column_size * 8);
                /* frame index = (stride - elmCnt) * 2 + 1 */
                dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - 48) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 48;
                channel_obj.frmCnt = 8;
                while (flag[4] == 0);
                channel_obj.channel_number = 4;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

                /* move Chroma V ICMB to SDRAM */
                src_obj.staddr = (uint32) dec->local_cur.v;
                dst_obj.staddr = (uint32) (dec->cur.v + (y << 3) * (dec->edged_width >> 1)) +
                                    ((x - 1) / local_cur_column_size) * (local_cur_column_size * 8);
                /* frame index = (stride - elmCnt) * 2 + 1 */
                dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) - 48) * 2 + 1;
                dst_obj.addressing_elmIndex = 1;
                channel_obj.elmCnt = 48;
                channel_obj.frmCnt = 8;
                while (flag[1] == 0);
                channel_obj.channel_number = 1;
                dma_api_ssd_pd(src_obj, dst_obj, channel_obj);

#if defined(_PROFILING_)
                stop_dma_out();
#endif
            }

            /* copy additional 3x1 reference MBs into IRMB */
#if defined(_PROFILING_)
            start_timer();
#endif
            if (x < dec->mb_width - 1)
            {
                FuncPtr_DMA_one_column[y][x + 1] (dec, x + 1, y);
            }
#if defined(_PROFILING_)
            stop_dma_in_2();
#endif
            /* not_coded bit: 0 means the MB carries coded data */
            if (!BitstreamGetBit(bs))   /* coded MB */
            {
                mcbpc = get_mcbpc_inter(bs);
                mb->mode = mcbpc & 7;
                cbpc = (mcbpc >> 4);
                acpred_flag = 0;
                intra = (mb->mode == MODE_INTRA || mb->mode == MODE_INTRA_Q);
                if (intra)
                {
                    acpred_flag = BitstreamGetBit(bs);
                }
                if (mb->mode == MODE_STUFFING)
                {
                    // DEBUG("Stuffed MBs");
                    goto next_video_packet;
                }
                cbpy = get_cbpy(bs, intra);
                cbp = (cbpy << 2) | cbpc;
                /* running_qp == 0 marks "first MB after a video packet
                 * header": fall back to the header's quant in that case. */
                mb->quant = (running_qp) ? running_qp : quant;
                if (mb->mode == MODE_INTER_Q || mb->mode == MODE_INTRA_Q)
                {
                    mb->quant += dquant_table[BitstreamGetBits(bs, 2)];
                    if (mb->quant > 31)
                    {
                        mb->quant = 31;
                    }
                    else if (mb->quant < 1)
                    {
                        mb->quant = 1;
                    }
                }

                if (running_qp == 0)
                {
                    running_qp = mb->quant;
                }

                if (mb->mode == MODE_INTER || mb->mode == MODE_INTER_Q)
                {
#if defined(_PROFILING_)
                    start_timer();
#endif
                    get_motion_vector(dec, bs, x, y, 0, &mb->mvs[0], fcode);
#if defined(_PROFILING_)
                    stop_get_mv_timer();
#endif
                    /* one vector for the whole MB: replicate to all 4 blocks */
                    mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = mb->mvs[0].x;
                    mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = mb->mvs[0].y;
                }
                else if (mb->mode == MODE_INTER4V)
                {
#if defined(_PROFILING_)
                    start_timer();
#endif
                    get_motion_vector(dec, bs, x, y, 0, &mb->mvs[0], fcode);
                    get_motion_vector(dec, bs, x, y, 1, &mb->mvs[1], fcode);
                    get_motion_vector(dec, bs, x, y, 2, &mb->mvs[2], fcode);
                    get_motion_vector(dec, bs, x, y, 3, &mb->mvs[3], fcode);
#if defined(_PROFILING_)
                    stop_get_mv_timer();
#endif
                }
                else // MODE_INTRA, MODE_INTRA_Q
                {
                    mb->mvs[0].x = mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = 0;
                    mb->mvs[0].y = mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = 0;
                    /* the previous QP (running_qp) is passed, not mb->quant */
                    decoder_mbintra(dec, mb, x, y, acpred_flag, cbp, bs,
                                    running_qp, intra_dc_threshold);
                    running_qp = mb->quant;
                    goto next_video_packet;
                }

                decoder_mbinter(dec, mb, x, y, acpred_flag, cbp, bs, quant, rounding);
                running_qp = mb->quant;
            }
            else /* not coded */
            {
                mb->mode = MODE_NOT_CODED;
                memset((void *) mb->mvs, 0, 4 * sizeof(VECTOR));
#if defined(_PROFILING_)
                start_timer();
#endif
                /* Zero-motion copy of the co-located MB from the local
                 * reference buffers into the local current buffers.  z is
                 * the MB's column inside the reference buffer; it wraps every
                 * (local_ref_column_size - 2) macroblocks. */
                z = (x % (local_ref_column_size - 2));
                temp_size_for_stride = local_ref_column_size * 16;

                /* Y component: 16 rows of 16 bytes */
                src = (uint8 *) ((uint32) dec->local_refn.y + (16 * z));
                dst = (uint8 *) ((uint32) dec->local_cur.y + (16 * (x % local_cur_column_size)));
                for (i = 0; i < 16; i++)
                {
                    memcpy(dst, src, 16);
                    dst = (uint8 *) ((uint32) dst + local_cur_column_size * 16);
                    src = (uint8 *) ((uint32) src + temp_size_for_stride);
                }

                /* U component: 8 rows of 8 bytes, half the luma stride */
                src = (uint8 *) ((uint32) dec->local_refn.u + (8 * z));
                dst = (uint8 *) ((uint32) dec->local_cur.u + (8 * (x % local_cur_column_size)));
                for (i = 0; i < 8; i++)
                {
                    memcpy(dst, src, 8);
                    dst = (uint8 *) ((uint32) dst + local_cur_column_size * 8);
                    src = (uint8 *) ((uint32) src + (temp_size_for_stride >> 1));
                }

                /* V component: 8 rows of 8 bytes, half the luma stride */
                src = (uint8 *) ((uint32) dec->local_refn.v + (8 * z));
                dst = (uint8 *) ((uint32) dec->local_cur.v + (8 * (x % local_cur_column_size)));
                for (i = 0; i < 8; i++)
                {
                    memcpy(dst, src, 8);
                    dst = (uint8 *) ((uint32) dst + local_cur_column_size * 8);
                    src = (uint8 *) ((uint32) src + (temp_size_for_stride >> 1));
                }
#if defined(_PROFILING_)
                stop_comp_timer();  /* counted as compensation: this is a zero-motion copy */
#endif
            }
          next_video_packet:
            /* Decode video packet header, if any.  We must check if there */
            /* is a valid stuffing pattern before the next resync_marker.  */
            /* Otherwise, it is possible that some uncoded MBs will be     */
            /* dropped by the decoder.                 CJ Tsai, Feb/7/2003 */
            if (!valid_stuffing(bs))
            {
                continue;
            }

            if (BitstreamShowBitsByteAlign(bs, resync_marker_length) == 1)
            {
                decode_video_packet_header(dec, bs, fcode, &quant);
                /* 0 tells the next coded MB to start from the header quant */
                running_qp = 0;
                slice_no++;
            }
        }
        /* skip the extra column of the slice map at the end of each row */
        slice_idx++;

        /* Prefetch the reference MBs for the next row of this slice. */
        if (y < (mb_row + ROW_PER_SLICE - 1))
        {
#if defined(_PROFILING_)
            start_timer();
#endif
            FuncPtr_DMA[y + 1] (dec, y + 1);
#if defined(_PROFILING_)
            stop_dma_in_3();
#endif
        }


        /* move the last few MBs per row in ICMB to SDRAM */
        /* Here x == dec->mb_width, so (x % local_cur_column_size) is the
         * number of MBs left in the partially filled group.
         * NOTE(review): if dec->mb_width is an exact multiple of
         * local_cur_column_size this count is 0 and the last full group is
         * not written by the in-loop flush either -- confirm such widths
         * cannot occur. */
#if defined(_PROFILING_)
        start_timer();
#endif
        /* move Luma ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.y;
        src_obj.addressing_frmIndex = (local_cur_column_size * 16 -
             (x % local_cur_column_size) * 16) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) ((uint32) dec->cur.y +
                                   (uint32) (y << 4) * dec->edged_width) +
                                   (uint32) (x / local_cur_column_size) *
                                            (local_cur_column_size * 16);
        /* frame index = (stride - elmCnt) * 2 + 1 */
        dst_obj.addressing_frmIndex = (dec->edged_width - (x % local_cur_column_size) * 16) * 2 + 1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 16;
        channel_obj.frmCnt = 16;
        channel_obj.channel_number = 5;
        while (flag[5] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);

        /* move Chroma U ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.u;
        src_obj.addressing_frmIndex = (local_cur_column_size * 8 -
                                       (x % local_cur_column_size) * 8) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) (dec->cur.u + (y << 3) * (dec->edged_width >> 1)) +
                                  (x / local_cur_column_size) * (local_cur_column_size * 8);
        /* frame index = (stride - elmCnt) * 2 + 1 */
        dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) -
                                       (x % local_cur_column_size) * 8) * 2 + 1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 8;
        channel_obj.frmCnt = 8;
        channel_obj.channel_number = 4;
        while (flag[4] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);

        /* move Chroma V ICMB to SDRAM */
        src_obj.staddr = (uint32) dec->local_cur.v;
        src_obj.addressing_frmIndex = (local_cur_column_size * 8 -
                                      (x % local_cur_column_size) * 8) * 2 + 1;
        src_obj.addressing_elmIndex = 1;
        dst_obj.staddr = (uint32) (dec->cur.v + (y << 3) * (dec->edged_width >> 1)) +
                                  (x / local_cur_column_size) * (local_cur_column_size * 8);
        /* frame index = (stride - elmCnt) * 2 + 1 */
        dst_obj.addressing_frmIndex = ((dec->edged_width >> 1) -
                                       (x % local_cur_column_size) * 8) * 2 + 1;
        dst_obj.addressing_elmIndex = 1;
        channel_obj.elmCnt = (x % local_cur_column_size) * 8;
        channel_obj.frmCnt = 8;
        channel_obj.channel_number = 1;
        while (flag[1] == 0);
        dma_api_ssd_dd(src_obj, dst_obj, channel_obj);
#if defined(_PROFILING_)
        stop_dma_out();
#endif
    }
}

/* Per-slice copies of the bitstream reader state.  Entry i is filled by    */
/* task_initialization_for_a_video_frame() with the reader state at which   */
/* slice i was located.                                                      */
Bitstream bs_slice[SLICE_NUMBER_PER_FRAME];

/*
 * Per-frame setup: clear the slice map, swap the current and reference
 * images for P-VOPs, and record the bitstream state at the start of every
 * slice in bs_slice[].
 *
 * bs_slice[0] receives a copy of *bs as passed in.  For each further slice
 * the reader is first advanced by 8 - (pos % 8) bits (1..8 bits, i.e. past
 * the stuffing up to the next byte boundary, assuming BitstreamSkip()
 * advances bs->pos by the given bit count) and then one byte at a time until
 * the next 17 bits read as 0x01.  *bs is left at the last position found.
 *
 * NOTE(review): the scan has no end-of-buffer check visible here, and the
 * marker length is fixed at 17 bits whereas decoder_pslice() uses
 * fcode + 16 -- confirm both are acceptable for the supported streams.
 */
void
task_initialization_for_a_video_frame(DECODER *dec, Bitstream *bs, int frame_type)
{
    uint    slice_map_len = (dec->mb_height + 1) * (dec->mb_width + 1);
    int     align_bits;
    uint32  peek;
    int     slice;

    /* No macroblock belongs to any slice yet. */
    memset(dec->slice, 0, slice_map_len * sizeof(xint));

    if (frame_type == P_VOP)
    {
        image_swap(&dec->cur, &dec->refn);
    }

    /* The first slice starts right where the header parser stopped. */
    bs_slice[0] = *bs;

    for (slice = 1; slice < SLICE_NUMBER_PER_FRAME; slice++)
    {
        align_bits = 8 - (bs->pos % 8);
        BitstreamSkip(bs, align_bits);

        /* Step byte-wise until the 17-bit marker pattern is in view. */
        for (;;)
        {
            peek = BitstreamGetBits_notforward(bs, 17);
            if (peek == 0x01)
            {
                break;
            }
            BitstreamSkip(bs, 8);
        }

        bs_slice[slice] = *bs;
    }
}

/* VOP-level parameters written by BitstreamReadHeaders() when it is called */
/* from task_initialization().                                               */
uint32  rounding;
uint32  quant;
uint32  fcode;
uint32  intra_dc_threshold;

/*
 * Parse the headers of the frame held in vdec_obj->bitstream and prepare the
 * per-frame decoding state.
 *
 * The parsed parameters are stored in the file-scope variables rounding,
 * quant, fcode and intra_dc_threshold.  vdec_obj->type is set to 1 for
 * P_VOP, 0 for I_VOP, 2 for B_VOP and 3 for N_VOP.  For P- and I-VOPs the
 * slice start positions are also located via
 * task_initialization_for_a_video_frame().  vdec_obj->timestamp is copied
 * from the decoder.
 *
 * Returns 0 on success, or 1 when BitstreamReadHeaders() returns any other
 * value (vdec_obj->type and vdec_obj->timestamp are then left untouched).
 */
xint
task_initialization(DEC_CTRL *vdec_obj)
{
    DECODER *dec = (DECODER *) vdec_obj->handle;
    Bitstream bs;
    xint    vop_type;

    /* NOTE(review): started unconditionally but only stopped in _PROFILING_
     * builds, and never stopped on the error return -- confirm intended. */
    start_global_timer();

    BitstreamInit(&bs, vdec_obj->bitstream, vdec_obj->length);
#if defined(_PROFILING_)
    start_timer();
#endif

    vop_type = BitstreamReadHeaders(&bs, dec, &rounding, &quant, &fcode, &intra_dc_threshold);

    /* Stop the header timer once for every outcome, and only in profiling
     * builds, so that it always pairs with the guarded start_timer() above. */
#if defined(_PROFILING_)
    stop_bitstreamreadheaders_timer();
#endif

    switch (vop_type)
    {
    case P_VOP:
        vdec_obj->type = 1;
        task_initialization_for_a_video_frame(dec, &bs, P_VOP);
        break;

    case I_VOP:
        vdec_obj->type = 0;
        task_initialization_for_a_video_frame(dec, &bs, I_VOP);
        break;

    case B_VOP:
        vdec_obj->type = 2;
        break;

    case N_VOP:
        vdec_obj->type = 3;
        break;

    default:
        return 1;
    }

    vdec_obj->timestamp = dec->timestamp;
#if defined(_PROFILING_)
    start_timer();
#endif
#ifdef LCD_OUT
    image_output(&dec->cur, dec->width, dec->height, dec->edged_width,
                 vdec_obj->image, vdec_obj->stride, 1);
    //yuv420rgb16_QVGA((uint16 *)output_yuv_address_Y_UP,(uint16 *)output_yuv_address_U,(uint16 *)output_yuv_address_V,(uint16 *)FRAME_BUFFER_address, 320, 240, 1);    
#endif
#if defined(_PROFILING_)
    stop_conv_timer();
    stop_global_timer();
#endif
    return 0;
}
